This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

loading_zipdata <- "~/repos/Diversity-Richness/Zip.Code.Datasets/zip_data_unedited_nolabels.csv" #making a pathway to the downloaded census data
Unedited_zipdata <- read.csv(loading_zipdata) #loading the census data
reduced_collumns_zip <- c(1:2, 33:34, 42, 47:146) #removing unnecessary columns


Zipcode_Census_data<- Unedited_zipdata[ , reduced_collumns_zip] #make a new table without unnecessary columns

colnames(Zipcode_Census_data) <- c("FIPS", "Name", "3_Digit_Tabulation", "5_Digit_Tabulation" , "Area_Name","Total_Population", "Population_Density", "Area", "Total_Population1", "Total_Population_Male", "Total_Population_Female", "X_Total_Population_Male", "X_Total_Population_Female", "Total_Population2", "Total_Population_Under_5_Years", "Total_Population_5_to_9_Years", "Total_Population_10_to_14_Years", "Total_Population_15_to_17_Years", "Total_Population_18_to_24_Years", "Total_Population_25_to_34_Years", "Total_Population_35_to_44_Years", "Total_Population_45_to_54_Years", "Total_Population_55_to_64_Years", "Total_Population_65_to_74_Years", "Total_Population_75_to_84_Years", "Total_Population_85_Years_And_Over","X_Total_Population_Under_5_Years", "X_Total_Population_5_to_9_Years", "X_Total_Population_10_to_14_Years", "X_Total_Population_15_to_17_Years", "X_Total_Population_18_to_24_Years", "X_Total_Population_25_to_34_Years", "X_Total_Population_35_to_44_Years", "X_Total_Population_45_to_54_Years", "X_Total_Population_55_to_64_Years", "X_Total_Population_65_to_74_Years", "X_Total_Population_75_to_84_Years", "X_Total_Population_85_Years_And_Over", "Total_Population3", "Total_Population_White_Alone", "Total_Population_Black_or African_American_Alone", "Total_Population_American_Indian_And_Native_Alaskan_Alone", "Total_Population_Asian_Alone", "Total_Population_Native_Hawaiian_And_Other_Pacific_Islander_Alone", "Total_Population_Some_Other_Race_Alone","Total_Population_Two_Or_More_Races","X_Total_Population_White_Alone", "X_Total_Population_Black_or_African_American_Alone", "X_Total_Population_American_Indian_And_Native_Alaskan_Alone", "X_Total_Population_Asian_Alone", "X_Total_Population_Native_Hawaiian_And_Other_Pacific_Islander_Alone", "X_Total_Population_Some_Other_Race_Alone","X_Total_Population_Two_Or_More_Races","Households","Households_Family_Households","Households_Married_Couple_Family","Households_Other_Family","Households_Male_Householder_No_Wife_Present","Households_Female_Householder_No_Husband_Present","Households_Nonfamily_Households","Nonfamily_Households_Male_Householder","Nonfamily_Households_Female_Householder","X_Households_Family_Households","X_Households_Married_Couple_Family","X_Households_Other_Family","X_Households_Male_Householder_No_Wife_Present","X_Households_Female_Householder_No_Husband_Present","X_Households_Nonfamily_Households","X_Nonfamily_Households_Male_Householder","X_Nonfamily_Households_Female_Householder","Median_Household_Income","Housing_Units","Occupied_Housing_Units","Owner_Occupied","Renter_Occupied","X_Owner_Occupied","X_Renter_Occupied","Population_With_Poverty_Status","Poverty_Status_Under_1.00","Poverty_Status_1.00_to_1.99","Poverty_Status_Under_2.00","Poverty_Status_2.00_and_Over","X_Poverty_Status_Under_1.00","X_Poverty_Status_1.00_to_1.99","X_Poverty_Status_Under_2.00","X_Poverty_Status_2.00_and_Over", "Total_Population_Last", "Total_Not_Hispanic", "Total_Not_Hispanic_White_Alone", "Total_Not_Hispanic_Black_Alone", "Total_Not_Hispanic_American_Indian_Native_Alaskan_Alone","Total_Not_Hispanic_Asian_Alone","Total_Not_Hispanic_Native_Hawaiian_Pacific_Islander","Total_Not_Hispanic_Other_Race","Total_Not_Hispanic_Two_or_More_Races", "Total_Hispanic","X_Not_Hispanic", "X_Not_Hispanic_White_Alone", "X_Not_Hispanic_Black_Alone", "X_Not_Hispanic_American_Indian_Native_Alaskan_Alone","X_Not_Hispanic_Asian_Alone","X_Not_Hispanic_Native_Hawaiian_Pacific_Islander","X_Not_Hispanic_Other_Race","X_Not_Hispanic_Two_or_More_Races", "X_Hispanic")

#writing usable column names
write.csv(Zipcode_Census_data, file = "~/repos/Diversity-Richness/Zip.Code.Datasets/Zipcode_Census_data.csv", row.names = FALSE)
#saving the table as a csv file
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)

values <- c("Percent White" = "red","Percent African American" = "blue",  "Percent Asian" = "green", "Percent Native American" = "purple", "Percent Hispanic" = "yellow")
text_needed <-
"Percent White = RED 
Percent African American = BLUE 
Percent Asian = GREEN 
Percent Native American = PURPLE 
Percent Hispanic = YELLOW"

Zipcode_Census_data %>%
  group_by()
ggplot(Zipcode_Census_data) +
  geom_smooth(aes(x = X_Total_Population_White_Alone, y = Median_Household_Income), fill ="white", color = "red", show.legend = T) +
  geom_smooth(aes(x = X_Total_Population_Black_or_African_American_Alone, y = Median_Household_Income), fill = "white", color = "blue", show.legend = T) +
  geom_smooth(aes(x = X_Total_Population_Asian_Alone, y = Median_Household_Income), fill = "white", color = "green", show.legend = T) +
  geom_smooth(aes(x = X_Total_Population_American_Indian_And_Native_Alaskan_Alone, y = Median_Household_Income), fill = "white", color = "purple", show.legend = T) +
  geom_smooth(aes(x = X_Hispanic, y = Median_Household_Income), fill = "white", color = "yellow", show.legend = T) +
  labs(x = "Percentage Population of Population for Each Racial Demgraphic", y = "Median Household Income",
         title = "Racial Demographics and Median Household Income", caption = text_needed) +
  theme_classic()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).

#May want to add further descriptors or learn how to add multiple lines

(ref:Income_By_Racial_Demographic_Figure)

library(tidyverse)
library(ggplot2)

ggplot(Zipcode_Census_data) +
  geom_smooth(aes(x = X_Total_Population_White_Alone, y = Median_Household_Income, color = "#A1C9F4"), fill ="#A1C9F4", alpha = 0.2, show.legend = T)  +
  geom_smooth(aes(x = X_Total_Population_Black_or_African_American_Alone, y = Median_Household_Income, color = "#FFB482"), fill = "#FFB482", alpha = 0.2, show.legend = T) + #Fixed shading
  geom_smooth(aes(x = X_Total_Population_Asian_Alone, y = Median_Household_Income, color = "#8DE5A1"), fill = "#8DE5A1", alpha = 0.2, show.legend = T) +
  geom_smooth(aes(x = X_Total_Population_American_Indian_And_Native_Alaskan_Alone, y = Median_Household_Income, color = "#D0BBFF"), fill = "#D0BBFF", alpha = 0.2, show.legend = T) +
  geom_smooth(aes(x = X_Hispanic, y = Median_Household_Income, color = "#FF9F9B"), fill = "#FF9F9B", alpha = 0.2, show.legend = T) +
  scale_color_manual(values = c( "#8DE5A1" = "#8DE5A1","#A1C9F4" = "#A1C9F4","#D0BBFF" = "#D0BBFF", "#FF9F9B" = "#FF9F9B", "#FFB482" = "#FFB482"), #Fixed order
                     name = "Race",
                     labels = c("Asian", "White", "Native American/Alaskan", "Hispanic", "Black")) +
  labs(x = "Percentage Population of Population for Each Racial Demgraphic", y = "Median Household Income", title = "Racial Demographics and Median Household Income")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
(ref:Income_By_Racial_Demographic_Figure)

(ref:Income_By_Racial_Demographic_Figure)

#Don't know how to correct red shading on key
library(tidyverse)
library(ggplot2)


ggplot(Zipcode_Census_data) +
  geom_smooth(aes(x = X_Total_Population_White_Alone, y = Median_Household_Income, color = "#A1C9F4"), fill ="#A1C9F4", alpha = 0.2, show.legend = T, method = "lm", se =T)  +
  geom_smooth(aes(x = X_Total_Population_Black_or_African_American_Alone, y = Median_Household_Income, color = "#FFB482"), fill = "#FFB482", alpha = 0.2, show.legend = T, method = "lm", se =T) +
  geom_smooth(aes(x = X_Total_Population_Asian_Alone, y = Median_Household_Income, color = "#8DE5A1"), fill = "#8DE5A1", alpha = 0.2, show.legend = T, method = "lm", se = T) +
  geom_smooth(aes(x = X_Total_Population_American_Indian_And_Native_Alaskan_Alone, y = Median_Household_Income, color = "#D0BBFF"), fill = "#D0BBFF", alpha = 0.2, show.legend = T, method = "lm", se = T) +
  geom_smooth(aes(x = X_Hispanic, y = Median_Household_Income, color = "#FF9F9B"), fill = "#FF9F9B", alpha = 0.2, show.legend = T, method = "lm", se =T) +
  scale_color_manual(values = c( "#8DE5A1" = "#8DE5A1","#A1C9F4" = "#A1C9F4","#D0BBFF" = "#D0BBFF", "#FF9F9B" = "#FF9F9B", "#FFB482" = "#FFB482"),
                     name = "Race",
                     labels = c("Asian", "White", "Native American/Alaskan", "Hispanic", "Black")) +
  labs(x = "Percentage Population of Population for Each Racial Demographic", y = "Median Household Income", title = "Racial Demographics and Median Household Income")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).

library(tidyverse)
library(ggplot2)

ggplot(Zipcode_Census_data) +
  geom_jitter(aes(x = X_Total_Population_White_Alone, y = Median_Household_Income), fill ="black", color = "#ffd4c4") + #made a jitter plot first so it not cover
   geom_smooth(aes(x = X_Total_Population_White_Alone, y = Median_Household_Income), fill ="#ffb5e2", color = "#ce70e8" ) + #then added smooth plot to track trend
  labs(x = "Percentage Population White", y = "Median Household Income",
         title = "Whiteness and Median Household Income", caption = "Data: Zipcode_Census_data")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3153 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 3153 rows containing missing values (`geom_point()`).

#May want to add further descriptors or learn how to add multiple lines
Zipcode_with_Zones <- Zipcode_Census_data %>%
  mutate(Zipcode_Zone= case_when(
   substr(Name, 7, 8) == "00" ~ "Puerto Rico",
   substr(Name, 7, 7) == "0" ~ "North East",
    substr(Name, 7, 7) == "1" ~ "New York Region (Lower North East)",
    substr(Name, 7, 7) == "2" ~ "Central East Coast",
    substr(Name, 7, 7) == "3" ~ "South East",
    substr(Name, 7, 7) == "4" ~ "Great Lakes",
    substr(Name, 7, 7) == "5" ~ "Northern Midwest",
    substr(Name, 7, 7) == "6" ~ "Central Interior",
    substr(Name, 7, 7) == "7" ~ "Texas Region",
    substr(Name, 7, 7) == "8" ~ "Western Interior",
    substr(Name, 7, 7) == "9" ~ "West Coast (and Hawaii/Alaska)",
    substr(Name, 7, 8) == "00" ~ "Puerto Rico",
    TRUE ~ "Other"  # Default case
  ), .after = Area_Name)
  
Zipcode_with_States <- Zipcode_with_Zones %>%
  mutate(State_Territory= case_when(
    between(as.integer(substr(Name, 7, 9)), 039, 049) ~ "ME",
    between(as.integer(substr(Name, 7, 9)), 030, 038) ~ "NH",
    between(as.integer(substr(Name, 7, 9)), 010, 027) ~ "MA",
    between(as.integer(substr(Name, 7, 9)), 028, 029) ~ "RI",
    between(as.integer(substr(Name, 7, 9)), 150, 196) ~ "PA",
    between(as.integer(substr(Name, 7, 9)), 197, 199) ~ "DE",
    between(as.integer(substr(Name, 7, 9)), 206, 219) ~ "MD",
    between(as.integer(substr(Name, 7, 9)), 200, 205) ~ "DC",
    between(as.integer(substr(Name, 7, 9)), 220, 246) ~ "VA",
    between(as.integer(substr(Name, 7, 9)), 247, 269) ~ "WV",
    between(as.integer(substr(Name, 7, 9)), 386, 399) ~ "MS",
    between(as.integer(substr(Name, 7, 9)), 370, 385) ~ "TN",
    between(as.integer(substr(Name, 7, 9)), 700, 715) ~ "LA",
    between(as.integer(substr(Name, 7, 9)), 716, 729) ~ "AR",
    between(as.integer(substr(Name, 7, 9)), 550, 567) ~ "MN",
    between(as.integer(substr(Name, 7, 9)), 820, 831) ~ "WY",
    between(as.integer(substr(Name, 7, 9)), 832, 839) ~ "ID",
    between(as.integer(substr(Name, 7, 9)), 870, 884) ~ "NM",
    between(as.integer(substr(Name, 7, 9)), 889, 899) ~ "NV",
    between(as.integer(substr(Name, 7, 9)), 900, 961) ~ "CA",
    between(as.integer(substr(Name, 7, 9)), 980, 994) ~ "WA",
    between(as.integer(substr(Name, 7, 9)), 967, 968) ~ "HI",
    between(as.integer(substr(Name, 7, 9)), 995, 999) ~ "AK",
    between(as.integer(substr(Name, 7, 9)), 962, 966) ~ "AP",
    between(as.integer(substr(Name, 7, 9)), 006, 009) ~ "PR/VI",
    between(as.integer(substr(Name, 7, 8)), 10, 14) ~ "NY",
    between(as.integer(substr(Name, 7, 8)), 07, 08) ~ "NJ",
    between(as.integer(substr(Name, 7, 8)), 27, 28) ~ "NC",
    between(as.integer(substr(Name, 7, 8)), 30, 31) ~ "GA",
    between(as.integer(substr(Name, 7, 8)), 32, 34) ~ "FL",
    between(as.integer(substr(Name, 7, 8)), 35, 36) ~ "AL",
    between(as.integer(substr(Name, 7, 8)), 40, 42) ~ "KY",
    between(as.integer(substr(Name, 7, 8)), 43, 45) ~ "OH",
    between(as.integer(substr(Name, 7, 8)), 46, 47) ~ "IN",
    between(as.integer(substr(Name, 7, 8)), 48, 49) ~ "MI",
    between(as.integer(substr(Name, 7, 8)), 50, 12) ~ "IA",
    between(as.integer(substr(Name, 7, 8)), 53, 54) ~ "WI",
    between(as.integer(substr(Name, 7, 8)), 60, 62) ~ "IL",
    between(as.integer(substr(Name, 7, 8)), 63, 65) ~ "MO",
    between(as.integer(substr(Name, 7, 8)), 66, 67) ~ "KS",
    between(as.integer(substr(Name, 7, 8)), 68, 69) ~ "NE",
    between(as.integer(substr(Name, 7, 8)), 73, 74) ~ "OK",
    between(as.integer(substr(Name, 7, 8)), 75, 79) ~ "TX",
    between(as.integer(substr(Name, 7, 8)), 80, 81) ~ "CO",
    between(as.integer(substr(Name, 7, 8)), 85, 86) ~ "AZ",
    str_detect(Name, "05") ~ "VT",
    str_detect(Name, "06") ~ "CT",
    str_detect(Name, "29") ~ "SC",
    str_detect(Name, "57") ~ "SD",
    str_detect(Name, "58") ~ "ND",
    str_detect(Name, "59") ~ "MT",
    str_detect(Name, "84") ~ "UT",
    str_detect(Name, "97") ~ "OR",
    str_detect(Name, "09") ~ "AE",
    str_detect(Name, "340") ~ "AA",
    str_detect(Name, "969") ~ "PW/FM/MH/MP/GU",
    str_detect(Name, "96799") ~ "AS",
    TRUE ~ "Other"  # Default case
  ), .after = Zipcode_Zone) 

write.csv(Zipcode_with_States, file = "~/repos/Diversity-Richness/Zip.Code.Datasets/Zipcode_data_with_Areas_States.csv", row.names = FALSE)

(ref:Whiteness_By_Region_Boxplot)

# Create the boxplot
ggplot(Zipcode_with_Zones, aes(x = Zipcode_Zone, y = X_Total_Population_White_Alone, fill = Zipcode_Zone)) +
  geom_boxplot(outlier.shape = NA) +
  labs(x = "ZIP Code Zones", y = "Percentage White",
       title = "Percentage of Population White by Zip Code Region", fill = "Zone Names") +
  theme(axis.text.x = element_blank())
## Warning: Removed 587 rows containing non-finite values (`stat_boxplot()`).
(ref:Whiteness_By_Region_Boxplot)

(ref:Whiteness_By_Region_Boxplot)

(ref:Demographics_By_Region_Table)

Zipcode_with_States %>%
  group_by(Zipcode_Zone) %>%
  summarize(
    mean.percent.white = mean(X_Total_Population_White_Alone, na.rm = T),
    sd.percent.white = sd(X_Total_Population_White_Alone, na.rm = T),
    mean.percent.black = mean(X_Total_Population_Black_or_African_American_Alone, na.rm = T),
    sd.percent.black = sd(X_Total_Population_Black_or_African_American_Alone, na.rm = T),
    mean.percent.asian = mean(X_Total_Population_Asian_Alone, na.rm = T),
    sd.percent.asian = sd(X_Total_Population_Asian_Alone, na.rm = T),
    mean.percent.native.american = mean(X_Total_Population_American_Indian_And_Native_Alaskan_Alone, na.rm = T),
    sd.percent.native.american = sd(X_Total_Population_American_Indian_And_Native_Alaskan_Alone, na.rm = T),
    mean.percent.hispanic = mean(X_Hispanic, na.rm = T),
    sd.percent.hispanic = sd(X_Hispanic, na.rm = T),
    n.per.region = n()
  ) %>%
  knitr::kable(col.names = c("ZIP Code Region", "Percent White",    "SD White", "Percent Black",    "SD Black", "Percent Asian", "SD Asian", "Percent Native Americn", "SD Native American", "Percent Hispanic", "SD Hispanic", "ZIP Codes Per Region"), 
               digits = 2, 
               align = "lrlrlrlrlrlr",
               caption = "Average ZIP Code Demographics by Region") %>%
 kableExtra::kable_styling(latex_options = c("scale_down", "hold_position")) %>%
  kableExtra::column_spec(seq(2, 12, 2), bold = TRUE)
Average ZIP Code Demographics by Region
ZIP Code Region Percent White SD White Percent Black SD Black Percent Asian SD Asian Percent Native Americn SD Native American Percent Hispanic SD Hispanic ZIP Codes Per Region
Central East Coast 75.02 23.79 15.97 20.58 1.85 4.47 0.53 3.80 5.47 7.66 3452
Central Interior 87.65 16.50 4.12 12.27 1.34 3.58 0.58 3.36 5.78 9.86 3721
Great Lakes 88.36 16.24 5.32 13.45 1.05 2.67 0.32 1.56 3.47 5.70 3812
New York Region (Lower North East) 84.84 19.50 5.30 12.05 2.59 6.03 0.29 2.54 6.36 10.47 3726
North East 83.01 18.66 4.39 9.62 3.95 7.19 0.32 1.56 8.06 12.45 2445
Northern Midwest 89.37 15.50 1.48 5.30 1.05 2.77 3.08 12.65 3.74 5.62 3766
Puerto Rico 43.15 18.15 9.12 9.05 0.23 0.45 0.15 0.26 97.96 6.07 132
South East 68.73 24.74 21.41 24.39 1.55 3.32 0.33 1.23 9.14 14.17 3483
Texas Region 71.48 20.73 11.12 17.71 1.62 3.89 2.05 5.32 19.83 24.46 3808
West Coast (and Hawaii/Alaska) 63.64 26.01 3.04 5.78 7.91 12.41 5.51 17.96 22.06 23.07 3177
Western Interior 76.22 24.31 1.77 4.17 1.43 2.87 6.29 20.06 20.39 23.04 2252